Set Options

# Global knitr chunk defaults for the codebook run: code, errors, warnings and
# messages are all surfaced in the rendered document.
knitr::opts_chunk$set(
  # show the R code of each chunk
  echo = TRUE,
  # do not interrupt codebook generation when a chunk errors (helps debugging)
  error = TRUE,
  # show warnings raised during codebook generation
  warning = TRUE,
  # show messages raised during codebook generation
  message = TRUE
)

# Make theme_bw() the active ggplot2 theme.
ggplot2::theme_set(ggplot2::theme_bw())

library(rio)
library(labelled)

Prep Data

library(codebook)
## 
## Attaching package: 'codebook'
## The following object is masked from 'package:labelled':
## 
##     to_factor
# Load the processed trial-level data (gzipped CSV) that the codebook describes.
codebook_data <- rio::import(
  file = "../data_processing/output_data/trial_data/sr_trial_data.csv.gz"
)

# Helper kept for reference: prints the column names as a skeleton for the
# label list below.
# cat(paste(names(codebook_data), collapse = " = '', \n"))

# Attach a human-readable label to every column; labels are matched to columns
# by name.
var_label(codebook_data) <- list(
  # participant / trial identifiers
  observation = "Unique participant ID number.",
  fix_sender = "The sender_id column in a sortable format. You can sort the data by observation and this column to ensure it is in trial order.",

  # participant response
  response = "Participant response to the trial.",
  response_action = "Keypress used to indicate their response to the trial.",
  ended_on = "How the trial ended (timeout, form submit, completion, response).",
  duration = "The duration in milliseconds of the entire trial from time shown to time end. This variable was set to NA if the trial was incorrectly answered, too long (3000+ms) or too short (<=160ms). The original duration is also preserved.",

  # timing columns
  time_run = "The time in milliseconds from the start of the experiment it took to run (start to display) the trial.",
  time_render = "The time in milliseconds from the start of the experiment it took to render (prepare, get ready for) the trial.",
  time_show = "The time in milliseconds from the start of the experiment it took to show the trial on the screen to the participant.",
  time_end = "The time in milliseconds from the start of the experiment it took to end the current trial.",
  time_commit = "The time in milliseconds from the start of the experiment it took to save the current trial.",
  timestamp = "The approximate timestamp of the trial in UTC server time.",
  time_switch = "The time in milliseconds from the start of the experiment it took to switch between the previous trial and the current trial.",

  # stimulus and scoring
  word = "The string of letters/characters shown on the screen for the trial.",
  class = "The type of stimuli shown on the screen (word or nonword).",
  correct_response = "The correct answer for the trial.",
  correct = "A logical variable indicating if the participant got the trial answer correct.",
  original_duration = "The duration in milliseconds of the entire trial from time shown to time end.",
  Z_RT = "The Z-scored response latency (by participant) of the duration column.",

  # exclusion flags
  keep = "If the trial level data should be kept based on our exclusion rules (not too long < 3000 ms, not too short > 160ms, correctly answered).",
  keep_participant = "If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials SEEN.",
  keep_participant_answered = "If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials ANSWERED."
)

# Dataset-level metadata; these fields are what the codebook writes into its
# JSON-LD block. Assignment order is kept as-is.

# Title and free-text description (two paragraphs separated by a blank line).
metadata(codebook_data)$name <- "Semantic Priming Across Many Languages Trial Level Data"
metadata(codebook_data)$description <- paste0(
  "This dataset contains the processed trial data of the data collection from the SPAML project (example in Serbian, but all datasets are the same). The data is presented here in long format, with each trial representing one row in the data. All other trials (consent/demographic screens), fillers, fixation crosses, etc. have been excluded in this version. ",
  "\n\n",
  "Semantic priming has been studied for nearly 50 years across various experimental manipulations and theoretical frameworks. These studies provide insight into the cognitive underpinnings of semantic representations in both healthy and clinical populations; however, they have suffered from several issues including generally low sample sizes and a lack of diversity in linguistic implementations. Here, we will test the size and the variability of the semantic priming effect across ten languages by creating a large database of semantic priming values, based on an adaptive sampling procedure. Differences in response latencies between related word-pair conditions and unrelated word-pair conditions (i.e., difference score confidence interval is greater than zero) will allow quantifying evidence for semantic priming, whereas improvements in model fit with the addition of a random intercept for language will provide support for variability in semantic priming across languages."
)

# Provenance: identifier, creator, citation and release URL.
metadata(codebook_data)$identifier <- "https://doi.org/10.5281/zenodo.10888833"
metadata(codebook_data)$creator <- "Erin M. Buchanan"
metadata(codebook_data)$citation <- "Buchanan, E., Cuccolo, K., Heyman, T., Iyer, A., Coles, N., Lewis Jr, N., Peters, K., van Berkel, N., Taylor, J., Van't Veer, A. E., Montefinese, M., Valentine, K. D., Maxwell, N., Türkan, B. N., Williams, G., Oliveros-Chacana, J. C., Röer, J., Fini, C., Acar, O., … Lewis, S. C. (2024). SemanticPriming/SPAML: SPAML v1 Data Release (v1.0.0) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.10888833"
metadata(codebook_data)$url <- "https://github.com/SemanticPriming/SPAML/releases/"

# Publication date and coverage.
metadata(codebook_data)$datePublished <- "2024-05-01"
metadata(codebook_data)$temporalCoverage <- "2022-2024"
metadata(codebook_data)$spatialCoverage <- "Online"

Create codebook

# Render the codebook for the labelled data set; the report sections that
# follow in this document are its output.
codebook::codebook(codebook_data)

Metadata

Description

Dataset name: Semantic Priming Across Many Languages Trial Level Data

This dataset contains the processed trial data of the data collection from the SPAML project (example in Serbian, but all datasets are the same). The data is presented here in long format, with each trial representing one row in the data. All other trials (consent/demographic screens), fillers, fixation crosses, etc. have been excluded in this version.

Semantic priming has been studied for nearly 50 years across various experimental manipulations and theoretical frameworks. These studies provide insight into the cognitive underpinnings of semantic representations in both healthy and clinical populations; however, they have suffered from several issues including generally low sample sizes and a lack of diversity in linguistic implementations. Here, we will test the size and the variability of the semantic priming effect across ten languages by creating a large database of semantic priming values, based on an adaptive sampling procedure. Differences in response latencies between related word-pair conditions and unrelated word-pair conditions (i.e., difference score confidence interval is greater than zero) will allow quantifying evidence for semantic priming, whereas improvements in model fit with the addition of a random intercept for language will provide support for variability in semantic priming across languages.

Metadata for search engines
Creator: Erin M. Buchanan

Keywords: observation, fix_sender, response, response_action, ended_on, duration, time_run, time_render, time_show, time_end, time_commit, timestamp, time_switch, word, class, correct_response, correct, original_duration, Z_RT, keep, keep_participant, keep_participant_answered

Variables

observation

Unique participant ID number.

Distribution

Distribution of values for observation

Distribution of values for observation

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
observation Unique participant ID number. character 0 1 757 0 14 14 0

fix_sender

The sender_id column in a sortable format. You can sort the data by observation and this column to ensure it is in trial order.

Distribution

Distribution of values for fix_sender

Distribution of values for fix_sender

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
fix_sender The sender_id column in a sortable format. You can sort the data by observation and this column to ensure it is in trial order. character 0 1 800 0 11 11 0

response

Participant response to the trial.

Distribution

Distribution of values for response

Distribution of values for response

16151 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
response Participant response to the trial. character 16151 0.9717089 2 0 4 7 0

response_action

Keypress used to indicate their response to the trial.

Distribution

Distribution of values for response_action

Distribution of values for response_action

17677 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
response_action Keypress used to indicate their response to the trial. character 17677 0.9690359 2 0 11 11 0

ended_on

How the trial ended (timeout, form submit, completion, response).

Distribution

Distribution of values for ended_on

Distribution of values for ended_on

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
ended_on How the trial ended (timeout, form submit, completion, response). character 0 1 2 0 7 8 0

duration

The duration in milliseconds of the entire trial from time shown to time end. This variable was set to NA if the trial was incorrectly answered, too long (3000+ms) or too short (<=160ms). The original duration is also preserved.

Distribution

Distribution of values for duration

Distribution of values for duration

58151 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
duration The duration in milliseconds of the entire trial from time shown to time end. This variable was set to NA if the trial was incorrectly answered, too long (3000+ms) or too short (<=160ms). The original duration is also preserved. numeric 58151 0.8981392 160 755 2995 887.1916 414.2094 ▇▇▂▁▁

time_run

The time in milliseconds from the start of the experiment it took to run (start to display) the trial.

Distribution

Distribution of values for time_run

Distribution of values for time_run

1700 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
time_run The time in milliseconds from the start of the experiment it took to run (start to display) the trial. numeric 1700 0.9970222 38370 851402 1.1e+08 1108711 4058049 ▇▁▁▁▁

time_render

The time in milliseconds from the start of the experiment it took to render (prepare, get ready for) the trial.

Distribution

Distribution of values for time_render

Distribution of values for time_render

1700 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
time_render The time in milliseconds from the start of the experiment it took to render (prepare, get ready for) the trial. numeric 1700 0.9970222 38369 851401 1.1e+08 1108709 4058049 ▇▁▁▁▁

time_show

The time in milliseconds from the start of the experiment it took to show the trial on the screen to the participant.

Distribution

Distribution of values for time_show

Distribution of values for time_show

4057 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
time_show The time in milliseconds from the start of the experiment it took to show the trial on the screen to the participant. numeric 4057 0.9928935 38383 851954 1.1e+08 1106518 4062001 ▇▁▁▁▁

time_end

The time in milliseconds from the start of the experiment it took to end the current trial.

Distribution

Distribution of values for time_end

Distribution of values for time_end

1700 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
time_end The time in milliseconds from the start of the experiment it took to end the current trial. numeric 1700 0.9970222 39622 852268 1.1e+08 1109721 4058186 ▇▁▁▁▁

time_commit

The time in milliseconds from the start of the experiment it took to save the current trial.

Distribution

Distribution of values for time_commit

Distribution of values for time_commit

1700 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
time_commit The time in milliseconds from the start of the experiment it took to save the current trial. numeric 1700 0.9970222 39623 852270 1.1e+08 1109723 4058186 ▇▁▁▁▁

timestamp

The approximate timestamp of the trial in UTC server time.

Distribution

## 568725  unique, categorical values, so not shown.

1700 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique min median max
timestamp The approximate timestamp of the trial in UTC server time. POSIXct 1700 0.9970222 568725 2022-12-20 17:47:51.709 2023-10-27 13:21:45.922 2024-02-13 22:09:46.042

time_switch

The time in milliseconds from the start of the experiment it took to switch between the previous trial and the current trial.

Distribution

Distribution of values for time_switch

Distribution of values for time_switch

1943 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
time_switch The time in milliseconds from the start of the experiment it took to switch between the previous trial and the current trial. numeric 1943 0.9965965 39651 852243 1.1e+08 1109572 4057532 ▇▁▁▁▁

word

The string of letters/characters shown on the screen for the trial.

Distribution

Distribution of values for word

Distribution of values for word

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
word The string of letters/characters shown on the screen for the trial. character 0 1 3917 0 2 20 0

class

The type of stimuli shown on the screen (word or nonword).

Distribution

Distribution of values for class

Distribution of values for class

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
class The type of stimuli shown on the screen (word or nonword). character 0 1 2 0 4 7 0

correct_response

The correct answer for the trial.

Distribution

Distribution of values for correct_response

Distribution of values for correct_response

16151 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
correct_response The correct answer for the trial. character 16151 0.9717089 2 0 4 7 0

correct

A logical variable indicating if the participant got the trial answer correct.

Distribution

Distribution of values for correct

Distribution of values for correct

17677 missing values.

Summary statistics

name label data_type n_missing complete_rate count mean
correct A logical variable indicating if the participant got the trial answer correct. logical 17677 0.9690359 TRU: 517470, FAL: 35740 0.9353952

original_duration

The duration in milliseconds of the entire trial from time shown to time end.

Distribution

Distribution of values for original_duration

Distribution of values for original_duration

1700 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
original_duration The duration in milliseconds of the entire trial from time shown to time end. numeric 1700 0.9970222 -14 759 1620079 950.6084 3507.883 ▇▁▁▁▁

Z_RT

The Z-scored response latency (by participant) of the duration column.

Distribution

Distribution of values for Z_RT

Distribution of values for Z_RT

58151 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
Z_RT The Z-scored response latency (by participant) of the duration column. numeric 58151 0.8981392 -2.9 -0.26 12 0 0.9992723 ▇▃▁▁▁

keep

If the trial level data should be kept based on our exclusion rules (not too long < 3000 ms, not too short > 160ms, correctly answered).

Distribution

Distribution of values for keep

Distribution of values for keep

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
keep If the trial level data should be kept based on our exclusion rules (not too long < 3000 ms, not too short > 160ms, correctly answered). character 0 1 2 0 4 7 0

keep_participant

If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials SEEN.

Distribution

Distribution of values for keep_participant

Distribution of values for keep_participant

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
keep_participant If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials SEEN. character 0 1 2 0 4 7 0

keep_participant_answered

If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials ANSWERED.

Distribution

Distribution of values for keep_participant_answered

Distribution of values for keep_participant_answered

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
keep_participant_answered If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials ANSWERED. character 0 1 2 0 4 7 0

Missingness report

Codebook table

JSON-LD metadata

The following JSON-LD can be found by search engines, if you share this codebook publicly on the web.

{
  "name": "Semantic Priming Across Many Languages Trial Level Data",
  "description": "This dataset contains the processed trial data of the data collection from the SPAML project (example in Serbian, but all datasets are the same). The data is presented here in long format, with each trial representing one row in the data. All other trials (consent/demographic screens), fillers, fixation crosses, etc. have been excluded in this version. \n\nSemantic priming has been studied for nearly 50 years across various experimental manipulations and theoretical frameworks. These studies provide insight into the cognitive underpinnings of semantic representations in both healthy and clinical populations; however, they have suffered from several issues including generally low sample sizes and a lack of diversity in linguistic implementations. Here, we will test the size and the variability of the semantic priming effect across ten languages by creating a large database of semantic priming values, based on an adaptive sampling procedure. Differences in response latencies between related word-pair conditions and unrelated word-pair conditions (i.e., difference score confidence interval is greater than zero) will allow quantifying evidence for semantic priming, whereas improvements in model fit with the addition of a random intercept for language will provide support for variability in semantic priming across languages.\n\n\n## Table of variables\nThis table contains variable names, labels, and number of missing values.\nSee the complete codebook for more.\n\n[truncated]\n\n### Note\nThis dataset was automatically described using the [codebook R package](https://rubenarslan.github.io/codebook/) (version 0.9.2).",
  "identifier": "https://doi.org/10.5281/zenodo.10888833",
  "creator": "Erin M. Buchanan",
  "citation": "Buchanan, E., Cuccolo, K., Heyman, T., Iyer, A., Coles, N., Lewis Jr, N., Peters, K., van Berkel, N., Taylor, J., Van't Veer, A. E., Montefinese, M., Valentine, K. D., Maxwell, N., Türkan, B. N., Williams, G., Oliveros-Chacana, J. C., Röer, J., Fini, C., Acar, O., … Lewis, S. C. (2024). SemanticPriming/SPAML: SPAML v1 Data Release (v1.0.0) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.10888833",
  "url": "https://github.com/SemanticPriming/SPAML/releases/",
  "datePublished": "2024-05-01",
  "temporalCoverage": "2022-2024",
  "spatialCoverage": "Online",
  "keywords": ["observation", "fix_sender", "response", "response_action", "ended_on", "duration", "time_run", "time_render", "time_show", "time_end", "time_commit", "timestamp", "time_switch", "word", "class", "correct_response", "correct", "original_duration", "Z_RT", "keep", "keep_participant", "keep_participant_answered"],
  "@context": "http://schema.org/",
  "@type": "Dataset",
  "variableMeasured": [
    {
      "name": "observation",
      "description": "Unique participant ID number.",
      "@type": "propertyValue"
    },
    {
      "name": "fix_sender",
      "description": "The sender_id column in a sortable format. You can sort the data by observation and this column to ensure it is in trial order.",
      "@type": "propertyValue"
    },
    {
      "name": "response",
      "description": "Participant response to the trial.",
      "@type": "propertyValue"
    },
    {
      "name": "response_action",
      "description": "Keypress used to indicate their response to the trial.",
      "@type": "propertyValue"
    },
    {
      "name": "ended_on",
      "description": "How the trial ended (timeout, form submit, completion, response).",
      "@type": "propertyValue"
    },
    {
      "name": "duration",
      "description": "The duration in milliseconds of the entire trial from time shown to time end. This variable was set to NA if the trial was incorrectly answered, too long (3000+ms) or too short (<=160ms). The original duration is also preserved.",
      "@type": "propertyValue"
    },
    {
      "name": "time_run",
      "description": "The time in milliseconds from the start of the experiment it took to run (start to display) the trial.",
      "@type": "propertyValue"
    },
    {
      "name": "time_render",
      "description": "The time in milliseconds from the start of the experiment it took to render (prepare, get ready for) the trial.",
      "@type": "propertyValue"
    },
    {
      "name": "time_show",
      "description": "The time in milliseconds from the start of the experiment it took to show the trial on the screen to the participant.",
      "@type": "propertyValue"
    },
    {
      "name": "time_end",
      "description": "The time in milliseconds from the start of the experiment it took to end the current trial.",
      "@type": "propertyValue"
    },
    {
      "name": "time_commit",
      "description": "The time in milliseconds from the start of the experiment it took to save the current trial.",
      "@type": "propertyValue"
    },
    {
      "name": "timestamp",
      "description": "The approximate timestamp of the trial in UTC server time.",
      "@type": "propertyValue"
    },
    {
      "name": "time_switch",
      "description": "The time in milliseconds from the start of the experiment it took to switch between the previous trial and the current trial.",
      "@type": "propertyValue"
    },
    {
      "name": "word",
      "description": "The string of letters/characters shown on the screen for the trial.",
      "@type": "propertyValue"
    },
    {
      "name": "class",
      "description": "The type of stimuli shown on the screen (word or nonword).",
      "@type": "propertyValue"
    },
    {
      "name": "correct_response",
      "description": "The correct answer for the trial.",
      "@type": "propertyValue"
    },
    {
      "name": "correct",
      "description": "A logical variable indicating if the participant got the trial answer correct.",
      "@type": "propertyValue"
    },
    {
      "name": "original_duration",
      "description": "The duration in milliseconds of the entire trial from time shown to time end.",
      "@type": "propertyValue"
    },
    {
      "name": "Z_RT",
      "description": "The Z-scored response latency (by participant) of the duration column.",
      "@type": "propertyValue"
    },
    {
      "name": "keep",
      "description": "If the trial level data should be kept based on our exclusion rules (not too long < 3000 ms, not too short > 160ms, correctly answered).",
      "@type": "propertyValue"
    },
    {
      "name": "keep_participant",
      "description": "If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials SEEN.",
      "@type": "propertyValue"
    },
    {
      "name": "keep_participant_answered",
      "description": "If the participant should be kept based on our exclusion rules: must be 18 years old, saw at least 100 trials, correctly answered at least 80% of the trials ANSWERED.",
      "@type": "propertyValue"
    }
  ]
}